#Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
import sklearn
from random import shuffle
import xgboost
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
import missingno as msno
# Load the bankruptcy dataset from Excel into a DataFrame.
# NOTE(review): path is relative to the working directory — confirm the file location.
df = pd.read_excel('Bankruptcy dataset.xlsx')
# Preview the first 100 rows (notebook display).
df.head(100)
| Attr1 | Attr2 | Attr3 | Attr4 | Attr5 | Attr6 | Attr7 | Attr8 | Attr9 | Attr10 | ... | Attr56 | Attr57 | Attr58 | Attr59 | Attr60 | Attr61 | Attr62 | Attr63 | Attr64 | bankruptcy? | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.009746 | 0.59288 | 0.006111 | 1.01260 | -2157.3000 | -0.020388 | 0.011053 | 0.686680 | 1.57430 | 0.407120 | ... | 0.314050 | 0.023939 | 0.70510 | 0.190040 | 9.8813 | 5.4101 | 112.370 | 3.24820 | 3.0917 | 0 |
| 1 | 0.084104 | 0.47244 | 0.275890 | 1.64280 | -3.9840 | 0.339550 | 0.084104 | 1.116700 | 3.21940 | 0.527560 | ... | 0.029764 | 0.159420 | 0.97394 | 0.053003 | 10.6140 | 8.9620 | 48.661 | 7.50080 | 10.9160 | 0 |
| 2 | -0.221670 | 0.86884 | 0.125460 | 1.48690 | -2.8629 | -0.256780 | -0.202700 | 0.048645 | 0.87939 | 0.042265 | ... | -0.137150 | -5.244800 | 1.13710 | 14.461000 | 10.1470 | 6.0056 | 76.780 | 4.75380 | 1.9856 | 0 |
| 3 | 0.011514 | 0.80752 | -0.011056 | 0.98631 | -251.7900 | 0.029182 | 0.014275 | 0.234480 | 1.03500 | 0.189350 | ... | 0.033838 | 0.060808 | 0.96616 | 0.000000 | 1.5238 | 2.8550 | 373.180 | 0.97809 | 3.8805 | 0 |
| 4 | 0.245470 | 0.51585 | 0.244940 | 1.52010 | 5.2834 | 0.342680 | 0.293350 | 0.789690 | 1.13510 | 0.407360 | ... | 0.119020 | 0.602580 | 0.88098 | 0.110190 | 13.0540 | 5.3119 | 65.837 | 5.54400 | 9.1907 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 95 | -0.489630 | 0.88447 | -0.039895 | 0.92472 | -79.1430 | 0.000000 | -0.489630 | 0.130620 | 2.11530 | 0.115530 | ... | -0.192580 | -4.238100 | 1.23150 | 2.791000 | NaN | NaN | 91.449 | 3.99130 | 4.1481 | 0 |
| 96 | 0.024406 | 0.67834 | 0.014262 | 1.02860 | -19.1890 | 0.000000 | 0.029385 | 0.474180 | 2.26790 | 0.321660 | ... | -0.041003 | 0.075875 | 0.98802 | 0.428870 | 19.4650 | 6.2855 | 80.339 | 4.54320 | 4.6609 | 0 |
| 97 | 0.321240 | 0.24355 | 0.266960 | 2.25570 | -10.0010 | 0.000000 | 0.397510 | 3.105900 | 1.35680 | 0.756450 | ... | 0.324910 | 0.424670 | 0.67498 | 0.025341 | 11.3450 | 7.2292 | 57.191 | 6.38210 | 2.6069 | 0 |
| 98 | -0.115880 | 0.44584 | -0.087628 | 0.77779 | -39.7480 | 0.000000 | -0.115880 | 1.242900 | 1.79350 | 0.554160 | ... | -0.044591 | -0.209110 | 1.06410 | 0.092911 | 17.2300 | 15.8760 | 80.256 | 4.54790 | 2.5870 | 0 |
| 99 | 0.251490 | 0.57915 | 0.363980 | 1.62850 | 43.0830 | 0.000000 | 0.310470 | 0.726680 | 2.48990 | 0.420850 | ... | 0.126410 | 0.597560 | 0.87580 | 0.000000 | 23.6480 | 2.9921 | 84.899 | 4.29920 | 43.7770 | 0 |
100 rows × 65 columns
The dataset has 32,553 rows and 65 columns (64 financial-ratio features plus the `bankruptcy?` target).
# Class distribution of the target (0 = not bankrupt, 1 = bankrupt, per the
# labels used later in the confusion-matrix plots).
df['bankruptcy?'].value_counts()
0 30985 1 1568 Name: bankruptcy?, dtype: int64
The dataset is highly imbalanced: only 1,568 of the 32,553 companies (about 4.8%) are labelled bankrupt.
# Dataset dimensions: (rows, columns).
df.shape
(32553, 65)
# List the 64 anonymised feature columns (Attr1..Attr64) plus the target.
df.columns
Index(['Attr1', 'Attr2', 'Attr3', 'Attr4', 'Attr5', 'Attr6', 'Attr7', 'Attr8',
'Attr9', 'Attr10', 'Attr11', 'Attr12', 'Attr13', 'Attr14', 'Attr15',
'Attr16', 'Attr17', 'Attr18', 'Attr19', 'Attr20', 'Attr21', 'Attr22',
'Attr23', 'Attr24', 'Attr25', 'Attr26', 'Attr27', 'Attr28', 'Attr29',
'Attr30', 'Attr31', 'Attr32', 'Attr33', 'Attr34', 'Attr35', 'Attr36',
'Attr37', 'Attr38', 'Attr39', 'Attr40', 'Attr41', 'Attr42', 'Attr43',
'Attr44', 'Attr45', 'Attr46', 'Attr47', 'Attr48', 'Attr49', 'Attr50',
'Attr51', 'Attr52', 'Attr53', 'Attr54', 'Attr55', 'Attr56', 'Attr57',
'Attr58', 'Attr59', 'Attr60', 'Attr61', 'Attr62', 'Attr63', 'Attr64',
'bankruptcy?'],
dtype='object')
# Summary statistics per column — the extreme min/max values indicate heavy outliers.
df.describe()
| Attr1 | Attr2 | Attr3 | Attr4 | Attr5 | Attr6 | Attr7 | Attr8 | Attr9 | Attr10 | ... | Attr56 | Attr57 | Attr58 | Attr59 | Attr60 | Attr61 | Attr62 | Attr63 | Attr64 | bankruptcy? | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 32547.000000 | 32547.000000 | 32547.000000 | 32451.000000 | 3.248400e+04 | 32547.000000 | 32547.000000 | 32480.000000 | 32547.000000 | 32547.000000 | ... | 32456.000000 | 32548.000000 | 32487.000000 | 32548.000000 | 3.095500e+04 | 32476.000000 | 3.245600e+04 | 32451.000000 | 31940.000000 | 32553.000000 |
| mean | 0.039199 | 0.605968 | 0.109072 | 6.762114 | -5.425717e+02 | -0.053564 | 0.124465 | 13.527264 | 2.857175 | 0.657756 | ... | -0.769956 | 0.083595 | 3.230844 | 1.494216 | 3.479483e+02 | 18.602778 | 1.026340e+03 | 9.376223 | 82.368128 | 0.048168 |
| std | 2.242642 | 5.772857 | 5.744205 | 337.157408 | 7.024461e+04 | 6.716166 | 5.264709 | 559.851493 | 71.862465 | 15.873714 | ... | 71.229736 | 11.458138 | 140.257018 | 140.423466 | 2.837269e+04 | 637.515542 | 7.772792e+04 | 138.791417 | 2713.709056 | 0.214124 |
| min | -256.890000 | -72.162000 | -479.960000 | -0.403110 | -1.190300e+07 | -508.410000 | -189.560000 | -141.410000 | -0.612590 | -479.910000 | ... | -8534.600000 | -1667.300000 | -198.690000 | -327.970000 | -1.244000e+01 | -12.656000 | -2.336500e+06 | -1.543200 | -10677.000000 | 0.000000 |
| 25% | 0.003688 | 0.269665 | 0.021853 | 1.051300 | -4.868525e+01 | 0.000000 | 0.006154 | 0.432910 | 1.019500 | 0.296385 | ... | 0.009589 | 0.015130 | 0.874770 | 0.000000 | 5.544400e+00 | 4.502050 | 4.227125e+01 | 3.098750 | 2.186950 | 0.000000 |
| 50% | 0.050336 | 0.472240 | 0.196940 | 1.569800 | -1.149400e+00 | 0.000000 | 0.060533 | 1.069150 | 1.200400 | 0.505960 | ... | 0.053665 | 0.121065 | 0.950050 | 0.006729 | 9.745700e+00 | 6.612000 | 7.133550e+01 | 5.084500 | 4.288200 | 0.000000 |
| 75% | 0.130165 | 0.687770 | 0.403850 | 2.791450 | 5.034125e+01 | 0.088701 | 0.151700 | 2.607075 | 2.057150 | 0.709110 | ... | 0.129737 | 0.285462 | 0.992450 | 0.236650 | 2.015250e+01 | 10.446250 | 1.172000e+02 | 8.569150 | 9.680700 | 0.000000 |
| max | 94.280000 | 480.960000 | 28.336000 | 53433.000000 | 1.250100e+06 | 322.200000 | 649.230000 | 53432.000000 | 9742.300000 | 1099.500000 | ... | 293.150000 | 552.640000 | 18118.000000 | 23853.000000 | 4.818700e+06 | 108000.000000 | 1.077900e+07 | 23454.000000 | 294770.000000 | 1.000000 |
8 rows × 65 columns
# Column dtypes and non-null counts — all features are float64, target is int64.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 32553 entries, 0 to 32552 Data columns (total 65 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Attr1 32547 non-null float64 1 Attr2 32547 non-null float64 2 Attr3 32547 non-null float64 3 Attr4 32451 non-null float64 4 Attr5 32484 non-null float64 5 Attr6 32547 non-null float64 6 Attr7 32547 non-null float64 7 Attr8 32480 non-null float64 8 Attr9 32547 non-null float64 9 Attr10 32547 non-null float64 10 Attr11 32522 non-null float64 11 Attr12 32451 non-null float64 12 Attr13 32456 non-null float64 13 Attr14 32547 non-null float64 14 Attr15 32524 non-null float64 15 Attr16 32480 non-null float64 16 Attr17 32480 non-null float64 17 Attr18 32547 non-null float64 18 Attr19 32455 non-null float64 19 Attr20 32456 non-null float64 20 Attr21 28112 non-null float64 21 Attr22 32547 non-null float64 22 Attr23 32456 non-null float64 23 Attr24 31863 non-null float64 24 Attr25 32547 non-null float64 25 Attr26 32480 non-null float64 26 Attr27 30480 non-null float64 27 Attr28 31940 non-null float64 28 Attr29 32547 non-null float64 29 Attr30 32456 non-null float64 30 Attr31 32456 non-null float64 31 Attr32 32262 non-null float64 32 Attr33 32451 non-null float64 33 Attr34 32480 non-null float64 34 Attr35 32547 non-null float64 35 Attr36 32547 non-null float64 36 Attr37 18343 non-null float64 37 Attr38 32547 non-null float64 38 Attr39 32456 non-null float64 39 Attr40 32451 non-null float64 40 Attr41 31993 non-null float64 41 Attr42 32456 non-null float64 42 Attr43 32456 non-null float64 43 Attr44 32456 non-null float64 44 Attr45 30959 non-null float64 45 Attr46 32450 non-null float64 46 Attr47 32315 non-null float64 47 Attr48 32547 non-null float64 48 Attr49 32456 non-null float64 49 Attr50 32480 non-null float64 50 Attr51 32547 non-null float64 51 Attr52 32312 non-null float64 52 Attr53 31940 non-null float64 53 Attr54 31940 non-null float64 54 Attr55 32552 non-null float64 55 Attr56 32456 non-null float64 
56 Attr57 32548 non-null float64 57 Attr58 32487 non-null float64 58 Attr59 32548 non-null float64 59 Attr60 30955 non-null float64 60 Attr61 32476 non-null float64 61 Attr62 32456 non-null float64 62 Attr63 32451 non-null float64 63 Attr64 31940 non-null float64 64 bankruptcy? 32553 non-null int64 dtypes: float64(64), int64(1) memory usage: 16.1 MB
# Count of missing values per column.
df.isna().sum()
Attr1 6
Attr2 6
Attr3 6
Attr4 102
Attr5 69
...
Attr61 77
Attr62 97
Attr63 102
Attr64 613
bankruptcy? 0
Length: 65, dtype: int64
# Bar chart of per-column data completeness (missingno).
msno.bar(df)
<Axes: >
#Replace missing values with mean
# NOTE(review): imputing with the full-dataset mean before the train/test split
# leaks test-set statistics into training — consider fitting the imputer on the
# training fold only.
df = df.fillna(df.mean())
# Plot the class distribution of the bankruptcy target label.
# (The original comment "count of people doing overtime" was copied from an
# unrelated project and did not describe this plot.)
sns.countplot(df['bankruptcy?'])
fig = plt.gcf()
plt.title('Target Labels')
#plt.savefig("./images/output1.png")
plt.show()
# Correlation heatmap across all 65 columns; very large figure so the
# annotated cells stay legible.
f,ax = plt.subplots(figsize=(50, 50))
a =sns.heatmap(df.corr(), annot=True,ax=ax)
plt.title("Heatmap")
#plt.savefig("./images/output2.png")
plt.show()
# Re-list the columns (unchanged by the imputation above).
df.columns
Index(['Attr1', 'Attr2', 'Attr3', 'Attr4', 'Attr5', 'Attr6', 'Attr7', 'Attr8',
'Attr9', 'Attr10', 'Attr11', 'Attr12', 'Attr13', 'Attr14', 'Attr15',
'Attr16', 'Attr17', 'Attr18', 'Attr19', 'Attr20', 'Attr21', 'Attr22',
'Attr23', 'Attr24', 'Attr25', 'Attr26', 'Attr27', 'Attr28', 'Attr29',
'Attr30', 'Attr31', 'Attr32', 'Attr33', 'Attr34', 'Attr35', 'Attr36',
'Attr37', 'Attr38', 'Attr39', 'Attr40', 'Attr41', 'Attr42', 'Attr43',
'Attr44', 'Attr45', 'Attr46', 'Attr47', 'Attr48', 'Attr49', 'Attr50',
'Attr51', 'Attr52', 'Attr53', 'Attr54', 'Attr55', 'Attr56', 'Attr57',
'Attr58', 'Attr59', 'Attr60', 'Attr61', 'Attr62', 'Attr63', 'Attr64',
'bankruptcy?'],
dtype='object')
from imblearn.over_sampling import SMOTE
- Recall is the metric we need to optimise here: the higher the recall on the bankrupt class, the better the model.
Using cross-validation to select the best model.
# Test options and evaluation metric.
num_folds = 10
# BUG FIX: the original `seed = np.random.seed` bound the *function object* to
# `seed` without ever seeding the RNG.  Use a fixed integer and actually seed,
# so cross-validation and splits are reproducible.
seed = 42
np.random.seed(seed)
# Recall on the minority (bankrupt) class is the metric we optimise.
scoring = 'recall'
validation_size = 0.15
# Separate the 64 features from the binary target.
x, y = df.drop('bankruptcy?', axis=1), df['bankruptcy?']
# Hold out 25% of the rows for final evaluation.
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.25, random_state=42, shuffle=True)
# Spot-check a suite of baseline classifiers with k-fold cross-validation,
# scored on recall for the minority (bankrupt) class.
# FIXES: use the shared `num_folds` constant instead of a hardcoded 10, and
# strip the stray padding from the model labels (' LR ' -> 'LR') so they match
# the naming style of every other comparison block in this file.
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
# Evaluate each model in turn.
results = []
names = []
for name, model in models:
    kfold = KFold(n_splits=num_folds)
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
# Compare algorithms with a box plot of the per-fold recall scores.
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()
LR : 0.005129 (0.005857) LDA : 0.009254 (0.009266) KNN : 0.082951 (0.021880) CART : 0.507805 (0.042514) NB : 0.963884 (0.014869) SVM : 0.000000 (0.000000)
# Standardize the dataset: wrap each baseline model in a pipeline whose first
# step is a StandardScaler, then repeat the k-fold recall evaluation.
base_models = [
    ('LR', LogisticRegression()),
    #('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC()),
]
pipelines = [
    ('Scaled' + label, Pipeline([('Scaler', StandardScaler()), (label, estimator)]))
    for label, estimator in base_models
]
results = []
names = []
for name, model in pipelines:
    kfold = KFold(n_splits=num_folds)
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    print("%s: %f (%f)" % (name, cv_results.mean(), cv_results.std()))
ScaledLR: 0.007454 (0.006691) ScaledKNN: 0.040415 (0.013379) ScaledCART: 0.505869 (0.031354) ScaledNB: 0.959051 (0.014125) ScaledSVM: 0.000000 (0.000000)
# Box plot comparing the scaled pipelines' per-fold recall scores.
fig = plt.figure()
fig.suptitle(' Scaled Algorithm Comparison ')
ax = fig.add_subplot(111)
ax.boxplot(results)
ax.set_xticklabels(names)
#plt.savefig("./images/output4.png")
plt.show()
# Spot-check boosting/bagging ensembles on the raw (unscaled) features.
ensembles = [
    ('AB', AdaBoostClassifier()),
    ('GBM', GradientBoostingClassifier()),
    ('RF', RandomForestClassifier()),
    ('ET', ExtraTreesClassifier()),
]
results, names = [], []
for name, model in ensembles:
    kfold = KFold(n_splits=num_folds)
    fold_scores = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(fold_scores)
    names.append(name)
    print("%s: %f (%f)" % (name, fold_scores.mean(), fold_scores.std()))
# Compare the ensembles with a box plot of the per-fold scores.
fig = plt.figure()
fig.suptitle('Ensemble Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()
# Repeat the ensemble comparison with standardised inputs: each ensemble is
# wrapped in a pipeline that scales the features first.
ensembles = []
for tag, estimator in [('AB', AdaBoostClassifier()),
                       ('GBM', GradientBoostingClassifier()),
                       ('RF', RandomForestClassifier()),
                       ('ET', ExtraTreesClassifier())]:
    ensembles.append(('Scaled' + tag,
                      Pipeline([('Scaler', StandardScaler()), (tag, estimator)])))
out = []
Names = []
for Name, algos in ensembles:
    kfold = KFold(n_splits=num_folds)
    cv_results = cross_val_score(algos, X_train, Y_train, cv=kfold, scoring=scoring)
    out.append(cv_results)
    Names.append(Name)
    print("%s: %f (%f)" % (Name, cv_results.mean(), cv_results.std()))
ScaledAB: 0.180811 (0.031240) ScaledGBM: 0.396963 (0.035912) ScaledRF: 0.314226 (0.044579) ScaledET: 0.123142 (0.032020)
# Grid-search the number of neighbours for KNN on standardised training data,
# scored on recall with k-fold cross-validation.
# (This cell was collapsed onto a single line in the export; reformatted.)
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
neighbors = [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21]
param_grid = dict(n_neighbors=neighbors)
model = KNeighborsClassifier()
kfold = KFold(n_splits=num_folds)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(rescaledX, Y_train)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# Report every candidate's mean/std recall.
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
Convert the dataset into XGBoost's optimized DMatrix data structure, which gives XGBoost much of its acclaimed performance and efficiency.
# Fit a default XGBoost classifier and inspect which features it relies on.
from xgboost import XGBClassifier
model = XGBClassifier()
# fit the model
model.fit(X_train, Y_train)
# get importance
importance = model.feature_importances_
# summarize feature importance
for i, v in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (i, v))
# plot feature importance as a bar chart
plt.bar(range(len(importance)), importance)
#plt.savefig("./images/output6.png")
plt.show()
Feature: 0, Score: 0.00955 Feature: 1, Score: 0.00847 Feature: 2, Score: 0.01095 Feature: 3, Score: 0.00931 Feature: 4, Score: 0.03610 Feature: 5, Score: 0.02055 Feature: 6, Score: 0.01024 Feature: 7, Score: 0.01044 Feature: 8, Score: 0.01288 Feature: 9, Score: 0.01532 Feature: 10, Score: 0.01168 Feature: 11, Score: 0.01508 Feature: 12, Score: 0.01903 Feature: 13, Score: 0.00000 Feature: 14, Score: 0.00854 Feature: 15, Score: 0.00773 Feature: 16, Score: 0.01073 Feature: 17, Score: 0.00000 Feature: 18, Score: 0.01894 Feature: 19, Score: 0.01884 Feature: 20, Score: 0.01791 Feature: 21, Score: 0.01661 Feature: 22, Score: 0.00828 Feature: 23, Score: 0.01712 Feature: 24, Score: 0.01943 Feature: 25, Score: 0.05051 Feature: 26, Score: 0.03617 Feature: 27, Score: 0.01037 Feature: 28, Score: 0.01095 Feature: 29, Score: 0.01164 Feature: 30, Score: 0.01100 Feature: 31, Score: 0.01460 Feature: 32, Score: 0.02325 Feature: 33, Score: 0.07578 Feature: 34, Score: 0.03951 Feature: 35, Score: 0.00918 Feature: 36, Score: 0.00889 Feature: 37, Score: 0.01854 Feature: 38, Score: 0.01745 Feature: 39, Score: 0.00965 Feature: 40, Score: 0.01506 Feature: 41, Score: 0.02126 Feature: 42, Score: 0.00825 Feature: 43, Score: 0.02540 Feature: 44, Score: 0.01232 Feature: 45, Score: 0.01832 Feature: 46, Score: 0.00945 Feature: 47, Score: 0.01022 Feature: 48, Score: 0.01232 Feature: 49, Score: 0.00828 Feature: 50, Score: 0.00835 Feature: 51, Score: 0.01426 Feature: 52, Score: 0.00905 Feature: 53, Score: 0.00984 Feature: 54, Score: 0.00878 Feature: 55, Score: 0.04415 Feature: 56, Score: 0.00671 Feature: 57, Score: 0.02389 Feature: 58, Score: 0.00731 Feature: 59, Score: 0.00677 Feature: 60, Score: 0.01025 Feature: 61, Score: 0.00850 Feature: 62, Score: 0.01213 Feature: 63, Score: 0.00791
# Train a gradient-boosted tree model via the native xgboost API on DMatrix
# inputs, then report per-class metrics on the held-out test set.
train = xgboost.DMatrix(X_train, label=Y_train)
test = xgboost.DMatrix(X_test, label=Y_test)
xgb_params = {
    'objective': 'binary:logistic',  # emit P(bankrupt) for the binary target
    'tree_method': 'hist',           # histogram split finding, fast on 64 features
    'max_depth': 2,                  # shallow trees to limit overfitting
    'eta': 0.1,
    'subsample': 0.5,
    'colsample_bytree': 0.05,
}
# FIX: the original built a `watchlist` of (DMatrix, name) pairs but never
# passed it to train(), so it was dead code; it has been removed.  (Passing it
# as `evals=` would add per-round eval logging — a behavior change.)
clf = xgboost.train(xgb_params, train, num_boost_round=10000)
preds = clf.predict(test)
# Threshold the predicted probabilities at 0.5 and print the report.
print(classification_report((Y_test), preds.round(0)))
# Confusion matrix for the baseline boosted model.
# FIX: corrected the typo "digonal" -> "diagonal" in the user-facing message.
print(" In the Confusion Matrix below, the diagonal values represent correct classification for each class : ")
labels = ['Not Bankrupt', 'Bankrupt']
cm = sklearn.metrics.confusion_matrix((Y_test), (preds.round(0).astype(int)))
ax = plt.subplot()
sns.heatmap(cm.astype(int), annot=True, fmt='g', ax=ax)
# labels, title and ticks
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
ax.xaxis.set_ticklabels(labels)
ax.yaxis.set_ticklabels(labels)
#plt.savefig("./images/output7.png")
plt.show()
precision recall f1-score support
0 0.98 0.99 0.98 7763
1 0.74 0.51 0.60 376
accuracy 0.97 8139
macro avg 0.86 0.75 0.79 8139
weighted avg 0.97 0.97 0.97 8139
In the Confusion Matrix below, the diagonal values represent correct classification for each class :
# Retrain with scale_pos_weight to up-weight the rare bankrupt class, then
# evaluate on the held-out test set with a classification report and
# confusion matrix.
train = xgboost.DMatrix(X_train, label=Y_train)
test = xgboost.DMatrix(X_test, label=Y_test)
xgb_params = {
    'objective': 'binary:logistic',
    'tree_method': 'hist',
    'max_depth': 2,
    'eta': 0.1,
    'subsample': 0.5,
    'colsample_bytree': 0.05,
    # NOTE(review): the negative/positive ratio in this dataset is ~20
    # (30985/1568); 8 appears to be a manual choice — confirm intent.
    'scale_pos_weight': 8,
}
# FIX: dropped the unused `watchlist` (it was never passed to train()).
# Note: unlike the previous cell, this call uses the default num_boost_round.
clf = xgboost.train(xgb_params, train)
preds = clf.predict(test)
print(classification_report((Y_test), preds.round(0)))
labels = ['Not Bankrupt', 'Bankrupt']
cm = sklearn.metrics.confusion_matrix((Y_test), (preds.round(0).astype(int)))
ax = plt.subplot()
sns.heatmap(cm.astype(int), annot=True, fmt='g', ax=ax)  # annot=True to annotate cells
# labels, title and ticks
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
ax.xaxis.set_ticklabels(labels)
ax.yaxis.set_ticklabels(labels)
#plt.savefig("./images/output8.png")
plt.show()
precision recall f1-score support
0 0.96 0.96 0.96 7763
1 0.18 0.19 0.18 376
accuracy 0.92 8139
macro avg 0.57 0.57 0.57 8139
weighted avg 0.92 0.92 0.92 8139
import pickle
# Optionally persist the trained booster to disk for later reuse
# (left commented out — enable once the ./model directory exists).
#file_name = "./model/xgb_model.pkl"
#pickle.dump(clf, open(file_name, "wb"))
#Balancing the data set and then applying the models..